An object of class Seurat
13714 features across 2700 samples within 1 assay
Active assay: RNA (13714 features, 0 variable features)
visualize QC metrics, and use these to filter cells:
We filter cells that have unique feature counts over 2,500 or less than 200
We filter cells that have >5% mitochondrial counts
Feature-feature relationships:
Feature selection (2000 features):
PC_ 1
Positive: CST3, TYROBP, LST1, AIF1, FTL
Negative: MALAT1, LTB, IL32, IL7R, CD2
PC_ 2
Positive: CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1
Negative: NKG7, PRF1, CST7, GZMB, GZMA
PC_ 3
Positive: HLA-DQA1, CD79A, CD79B, HLA-DQB1, HLA-DPB1
Negative: PPBP, PF4, SDPR, SPARC, GNG11
PC_ 4
Positive: HLA-DQA1, CD79B, CD79A, MS4A1, HLA-DQB1
Negative: VIM, IL7R, S100A6, IL32, S100A8
PC_ 5
Positive: GZMB, NKG7, S100A8, FGFBP2, GNLY
Negative: LTB, IL7R, CKB, VIM, MS4A7
Heatmap based on PCs:
Determine the dimensionality:
Based on the plot, we roughly choose the first 10 PCs
Visualizing use UMAP:
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2638
Number of edges: 96033
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8720
Number of communities: 9
Elapsed time: 0 seconds
Markers for every cluster compared to all remaining cells:
| p_val | avg_logFC | pct.1 | pct.2 | p_val_adj | cluster | gene |
|---|---|---|---|---|---|---|
| 0 | 0.7300635 | 0.901 | 0.594 | 0 | 0 | LDHB |
| 0 | 0.9219135 | 0.436 | 0.110 | 0 | 0 | CCR7 |
| 0 | 0.8921170 | 0.981 | 0.642 | 0 | 1 | LTB |
| 0 | 0.8586034 | 0.422 | 0.110 | 0 | 1 | AQP3 |
| 0 | 3.8608733 | 0.996 | 0.215 | 0 | 2 | S100A9 |
| 0 | 3.7966403 | 0.975 | 0.121 | 0 | 2 | S100A8 |
| 0 | 2.9875833 | 0.936 | 0.041 | 0 | 3 | CD79A |
| 0 | 2.4894932 | 0.622 | 0.022 | 0 | 3 | TCL1A |
| 0 | 2.1220555 | 0.985 | 0.240 | 0 | 4 | CCL5 |
| 0 | 2.0461687 | 0.587 | 0.059 | 0 | 4 | GZMK |
| 0 | 2.2954931 | 0.975 | 0.134 | 0 | 5 | FCGR3A |
| 0 | 2.1388125 | 1.000 | 0.315 | 0 | 5 | LST1 |
| 0 | 3.3462278 | 0.961 | 0.068 | 0 | 6 | GZMB |
| 0 | 3.6898996 | 0.961 | 0.131 | 0 | 6 | GNLY |
| 0 | 2.6832771 | 0.812 | 0.011 | 0 | 7 | FCER1A |
| 0 | 1.9924275 | 1.000 | 0.513 | 0 | 7 | HLA-DPB1 |
| 0 | 5.0207262 | 1.000 | 0.010 | 0 | 8 | PF4 |
| 0 | 5.9443347 | 1.000 | 0.024 | 0 | 8 | PPBP |
Visualize using VlnPlot:
Visualize using FeaturePlot:
Plotting the top 20 markers (or all markers if less than 20) for each cluster use DoHeatmap:
knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE, comment = "")
library(dplyr)
library(Seurat)
library(patchwork)
# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "filtered_gene_bc_matrices/hg19/")
# Initialize the Seurat object with the raw (non-normalized data).
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
pbmc
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
# filtering and normalizing
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
# feature selection
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2
# scaling
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
# Examine and visualize PCA results a few different ways
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
DimPlot(pbmc, reduction = "pca")
# heatmap
DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)
# JackStraw
pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
JackStrawPlot(pbmc, dims = 1:15)
ElbowPlot(pbmc)
# clustering
pbmc <- FindNeighbors(pbmc, dims = 1:10)
pbmc <- FindClusters(pbmc, resolution = 0.5)
# visualize result use UMAP
pbmc <- RunUMAP(pbmc, dims = 1:10)
DimPlot(pbmc, reduction = "umap")
# save files
saveRDS(pbmc, file = "pbmc_tutorial.rds")
# find markers for every cluster compared to all remaining cells, report only the positive ones
pbmc.markers = FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
pbmc.markers %>%
group_by(cluster) %>%
top_n(n = 2, wt = avg_logFC) %>%
knitr::kable()
# Visualize using VlnPlot
VlnPlot(pbmc, features = c("MS4A1", "CD79A"))
# plot raw counts as well
VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)
FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
"CD8A"))
# plotting the top 20 markers (or all markers if less than 20) for each cluster
top10 = pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
# assigning cell type identity to clusters
new.cluster.ids = c("Naive CD4 T", "Memory CD4 T", "CD14+ Mono", "B", "CD8 T", "FCGR3A+ Mono",
"NK", "DC", "Platelet")
names(new.cluster.ids) = levels(pbmc)
pbmc = RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
saveRDS(pbmc, file = "pbmc3k_final.rds")